Back to Article
Random Forest summary
Download Source

Random Forest summary

Alex Koiter

Load libraries

In [1]:
suppressPackageStartupMessages({
  library(tidyverse)
  library(patchwork)
  library(gt)
})
Warning: package 'ggplot2' was built under R version 4.4.3Warning: package 'patchwork' was built under R version 4.4.2

Importance

In [2]:
imp_data <- read_csv(here::here("./notebooks/importance_data.csv")) %>%
  rename("Terrain Attribute" = "Var.Names") %>%
  mutate(`Terrain Attribute` = fct_recode(`Terrain Attribute`, "Elevation" = "elevation", 
                                          "Catchment Area" = "catchment_area",
                                          "Plan Curvature" = "plan_curvature",
                                          "Profile Curvature" = "profile_curvature",
                                          "Rel. Slope Position" = "relative_slope_position",
                                          "SAGA Wetness Index" = "saga_wetness_index",
                                          "Vert. Dist. Channel" = "channel_network_distance")) %>%
  mutate(`Terrain Attribute` = fct_relevel(`Terrain Attribute`, "Elevation", "SAGA Wetness Index", "Rel. Slope Position", "Vert. Dist. Channel","Catchment Area", "Profile Curvature", "Plan Curvature")) %>%
  mutate(property = fct_recode(property, "italic(`a*`)" = "a_col", "italic(`b*`)" = "b_col", "italic(`c*`)" = "c_col", "italic(`h*`)" = "h_col", "italic(`x`)" = "x_col")) %>%
  mutate(property = fct_relevel(property, "Ca", "Co", "Cs", "Fe", "Li", "La", "Nb", "Ni", "Rb", "Sr", "italic(`a*`)", "italic(`b*`)", "italic(`c*`)", "italic(`h*`)", "italic(`x`)"))
Rows: 168 Columns: 7
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): Var.Names, site, property
dbl (4): %IncMSE, IncNodePurity, MSE_rank, Purity_rank

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.`summarise()` has grouped output by 'Terrain Attribute'. You can override using
the `.groups` argument.
In [3]:
p <- g2 + g1 + plot_layout(ncol = 1, heights = c(2, 10), guides = "collect") & theme(legend.position = 'right')
p
# ggsave(filename = "../Submission/Figure 4.png", plot = p, height = 130, width = 140, units = "mm", dpi = 600)
Figure 1: Heat map of the Random Forest regression results showing the ranking of the importance of terrain attributes (based on % increase in Mean Squared Error) in explaining the spatial variabilty of selected colour and geochemical properties within the agricultural and forested sites. Top panel shows an average ranking for each site and across both sites.

Performance

In [4]:
m2_performance <- read_csv(here::here("./notebooks/model_performance_data_49.csv")) |>
  mutate(property = fct_recode(property, "a*" = "a_col", "b*" = "b_col", "c*" = "c_col", "h*" = "h_col", "x" = "x_col", "Ca" = "ca", "Co" = "co", "Cs" = "cs", "Fe" = "fe", "Li" = "li", "La" = "la", "Nb" = "nb", "Ni" = "ni", "Rb" = "rb", "Sr" = "sr")) |>
  mutate(property = fct_relevel(property, "Ca", "Co", "Cs", "Fe", "Li", "La", "Nb", "Ni", "Rb", "Sr", "a*", "b*", "c*", "h*", "x")) |>
  rename("MSE_org" ="MSE", "R2_org" = "R2") |>
  arrange(property) 
Rows: 24 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): site, property
dbl (2): MSE, R2

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
In [5]:
m_performance <- read_csv(here::here("./notebooks/model_performance_data.csv")) |>
  mutate(property = fct_recode(property, "a*" = "a_col", "b*" = "b_col", "c*" = "c_col", "h*" = "h_col", "x" = "x_col")) |>
  mutate(property = fct_relevel(property, "Ca", "Co", "Cs", "Fe", "Li", "La", "Nb", "Ni", "Rb", "Sr", "a*", "b*", "c*", "h*", "x")) |>
  mutate(Var_exp = Var_exp *100,
         Var_exp_test = Var_exp_test *100) |>
  arrange(property) |>
  left_join(m2_performance, by = c("site", "property"))
Rows: 22 Columns: 8
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): site, property
dbl (6): MSE, Var_exp, MSE_test, Var_exp_test, R2, mse_test

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
In [6]:
In [7]:
m_performance |>
  group_by(site) |>
  gt() |>
  tab_spanner(label = "Training",
               columns = c(Var_exp, MSE)) |>
  tab_spanner(label = "Validation",
              columns = c(Var_exp_test, MSE_test)) |>
  tab_spanner(label = "Test",
              columns = c(R2, mse_test)) |>
    tab_spanner(label = "Original",
              columns = c(R2_org, MSE_org)) |>
  cols_move_to_start(columns = property) |>
  # tab_spanner(label = "Training", columns = c("MSE", "Var_exp")) |>
  # tab_spanner(label = "Validation", columns = c("MSE_test", "Var_exp_test")) |>
  # tab_spanner(label = "Testing", columns = "R2") |>
  fmt_number(columns = c("MSE_test", "MSE", "mse_test", "MSE_org"), decimal = 2) |>
  fmt_number(columns = c("Var_exp_test", "Var_exp"), decimal = 1) |>
  fmt_number(columns = c("R2", "R2_org"), decimal = 2) |>
  tab_style(style =  cell_text(weight = "bold", align = "center"), locations =  cells_row_groups()) |>
  tab_style(style =  cell_text(align = "left"), locations =  cells_body()) |>
  tab_style(style =  cell_text(weight = "bold", align = "left"), locations =  cells_column_labels()) |>
  # cols_label(Var_exp = md("% Var<br/>Training"), Var_exp_test = md("% Var<br/>Validation"), MSE_test = md("MSE<br/>Testing"), MSE = md("MSE<br/>Training"), R2 = md("{{R^2}}<br/>Testing"), property = "Property") |>
    cols_label(Var_exp = "% Variance", Var_exp_test = "% Variance", MSE_test = "MSE", MSE = "MSE", R2 = "{{R^2}}", R2_org = "{{R^2}}", property = "Property", mse_test = "MSE", MSE_org = "MSE") |>
  row_group_order(groups = c("Agriculture", "Forest"))  |>
  tab_style(
    style = list(cell_text(style = "italic")),
    locations = cells_body(
      columns = property,
      rows = property %in% c("a*", "b*", "c*", "h*", "x"))) |>
    tab_footnote(
    footnote = "Mean square error",
    locations = cells_column_labels(columns = c("mse_test", "MSE_test", "MSE", "MSE_org"))) |>
  tab_footnote(
    footnote = "Percent variance explained",
    locations = cells_column_labels(columns = c("Var_exp", "Var_exp_test"))) 
Table 1: Model summary and performance statistics for the random forest regression using the training, validation, test and original (non-interpolated) data sets.
Property
Training
Validation
Test
Original
% Variance1 MSE2 % Variance1 MSE2 R2 MSE2 R2 MSE2
Agriculture
Ca 91.6 0.37 91.8 0.36 0.91 0.38 0.95 0.23
Co 79.8 0.09 82.5 0.08 0.80 0.08 0.88 0.08
Cs 85.7 0.00 86.4 0.00 0.85 0.00 0.92 0.00
Fe 69.6 0.00 70.9 0.00 0.69 0.00 0.83 0.00
Li 59.3 0.54 59.8 0.53 0.64 0.51 0.88 0.24
La 93.0 0.05 93.1 0.04 0.93 0.05 0.96 0.03
Nb 57.3 0.00 59.1 0.00 0.55 0.00 0.71 0.00
Ni 93.1 0.34 93.7 0.33 0.93 0.34 0.95 0.25
Rb 95.3 0.73 96.1 0.64 0.95 0.79 0.98 0.39
Sr 93.5 97.22 93.6 93.97 0.93 105.59 0.97 44.77
a* 85.0 0.01 86.9 0.01 0.85 0.01 0.91 0.00
b* 72.5 0.14 75.3 0.12 0.72 0.15 0.89 0.09
c* 73.2 0.15 75.9 0.14 0.73 0.17 0.89 0.10
h* 58.3 0.00 58.6 0.00 0.56 0.00 0.73 0.00
x 73.3 0.00 73.6 0.00 0.69 0.00 0.82 0.00
Forest
Co 39.1 0.24 42.9 0.23 0.48 0.21 0.77 0.29
Cs 64.1 0.00 67.1 0.00 0.66 0.00 0.86 0.00
Li 41.3 0.28 42.0 0.28 0.46 0.28 0.66 0.26
La 43.3 1.40 47.5 1.32 0.48 1.23 0.78 0.60
Nb 55.0 0.00 55.9 0.00 0.58 0.00 0.84 0.00
Sr 59.4 29.43 59.1 29.66 0.59 29.25 0.82 13.88
h* 58.8 0.00 60.3 0.00 0.62 0.00 0.86 0.00
1 Percent variance explained
2 Mean square error